in this document I creat a new version for the flowcam classifier at 18 degrees in december 2021. There are new classes: 1 for small unidentified particles smaller than 10, which might be small digested algae and 1 for dividing chlamydomonas. I also cleaned the species libraries by eliminating particles that were outliers in any variables. This eliminated about 55 of the data
library(tidyverse)
library(randomForest)
library(readr)
library(viridis)
library(e1071)
library(caret)
set.seed(1)
colnames <- c("Particle_ID","Area_ABD","Area_Filled","Aspect_Ratio","Average_Blue",
"Average_Green","Average_Red","Calibration_Factor","Calibration_Image",
"Camera","Capture_X","Capture_Y","Ch1_Area","Ch1_Peak","Ch1_Width",
"Ch2_Area","Ch2_Peak","Ch2_Width","Ch2_Ch1_Ratio","Circle_Fit",
"Circularity","Circularity_Hu","Compactness","Convex_Perimeter",
"Convexity","Date","Diameter_ABD","Diameter_ESD","Edge_Gradient",
"Elongation","Feret_Angle_Max","Feret_Angle_Min","Fiber_Curl",
"Fiber_Straightness","Filter_Score","Geodesic_Aspect_Ratio",
"Geodesic_Length","Geodesic_Thickness","Image_File","Image_Height",
"Image_Width","Image_X","Image_Y","Intensity","Length",
"Particles_Per_Chain","Perimeter","Ratio_Blue_Green","Ratio_Red_Blue",
"Ratio_Red_Green","Roughness","Scatter_Area","Scatter_Peak","Scatter_Width",
"Sigma_Intensity","Source_Image","Sum_Intensity","Symmetry","Time",
"Timestamp","Transparency","Volume_ABD","Volume_ESD","Width","species")
dd_all <- read_csv("Data/libraries_FC100/dd_all.csv", show_col_types = FALSE)
dd_all$species <- ifelse(dd_all$species=="small_unidentified_possibly_small_digested_algae", "Small_unidentified",dd_all$species)
table(dd_all$species)
dd_all$id <- 1:nrow(dd_all)
dd_all_long <- dd_all %>%
dplyr::select(species, Area_ABD,Area_Filled,Aspect_Ratio,Average_Blue,
Average_Green,Average_Red,Circle_Fit,Circularity,
Compactness,Convexity,Diameter_ABD,Diameter_ESD,Edge_Gradient,
Elongation,Feret_Angle_Max,Feret_Angle_Min,Fiber_Curl,
Fiber_Straightness,Geodesic_Aspect_Ratio,Intensity,
Length,Perimeter,Ratio_Blue_Green,Ratio_Red_Blue,
Ratio_Red_Green,Roughness,Sigma_Intensity,Sum_Intensity,
Symmetry,Transparency,Width, id) %>%
pivot_longer(cols=-c(id,species), names_to="variable") %>%
dplyr::group_by(variable,species) %>%
mutate(iqr = IQR(value),
first_quartile = quantile(value, probs = 0.25),
third_quartile = quantile(value, probs = 0.75),
outlier = ifelse(value<first_quartile-1.5*iqr | value>third_quartile+1.5*iqr,T,F))
outliers <- dd_all_long %>%
dplyr::filter(outlier==T) %>%
dplyr::group_by(species, id) %>%
dplyr::summarise(n = n()) %>%
dplyr::filter(n>6)
## `summarise()` has grouped output by 'species'. You can override using the `.groups` argument.
table(outliers$species)
##
## airbubbles Chlamydomonas ChlamydomonasClumps
## 15 42 9
## Coleps_irchel Colpidium ColpidiumVacuoles
## 39 11 31
## Cosmarium Cryptomonas Debris
## 20 37 10
## Desmodesmus Dexiostoma DigestedAlgae
## 73 18 5
## DividingChlamydomonas Loxocephallus Monoraphidium
## 13 18 83
## OtherCiliate Small_unidentified Staurastrum1
## 1 219 29
## Staurastrum2 Tetrahymena
## 3 9
nrow(outliers)/nrow(dd_all)
## [1] 0.05332399
dd_all_filtered <- dd_all %>%
dplyr::filter(!is.element(id,outliers$id))
dd_all$removed_outliers <- F
dd_all_filtered$removed_outliers <- T
dd_all_comparison <- rbind(dd_all,dd_all_filtered)
dd_all_comparison %>%
ggplot(aes(log10(Area_ABD), col=removed_outliers))+
geom_histogram(aes(fill=removed_outliers, y=..density..), position = "identity", alpha=0.3) +
geom_boxplot(outlier.alpha = 0.3) +
theme_bw() +
facet_wrap(~species, scales = "free_y") +
geom_vline(xintercept = 1, col="black")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
dd_all_filtered %>%
ggplot(aes(log10(Area_ABD)))+
geom_density(aes(col=species))
species <- unique(dd_all$species)
species <- species[!is.element(species,c("airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
"OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))]
compositions <- read_csv("../compositions.csv")
## Rows: 15 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): composition
## dbl (19): richness, Chlamydomonas, Cryptomonas, Monoraphidium, Cosmarium, St...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
comp_name <- compositions$composition
compositions <- compositions[,species]
compositions_list <- apply(compositions, 1, function(x){
idx <- which(x==1)
names(idx)
})
names(compositions_list) <- comp_name
classifier_plot_svm <- function(table, combination.nr){
# cm <- classifier$confusion
cm <- table
ncol <- ncol(cm)
cm <- apply(cm, 1, function(x) x/sum(x))
cm_long <- data.frame(Predicted = factor(rep(rownames(cm),ncol), levels = rownames(cm)),
Truth = factor(rep(colnames(cm), each=ncol), levels = colnames(cm)),
Classification = as.vector(cm))
plot <- cm_long %>%
ggplot(aes(Predicted,Truth,fill=Classification))+
geom_tile() +
geom_text(aes(label = round(Classification, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(cm_long$Truth))) +
scale_fill_viridis(option = "D", end=.8, discrete=FALSE)+
theme(axis.text.x = element_text(angle = 90, hjust = 0))+
theme(legend.text = element_text(size=14),
axis.title = element_text(size=14),
title = element_text(size=16),
axis.text = element_text(size=13))+
labs(title=paste("PPV of",combination.nr),fill="PPV")
return(plot)
}
classifier_assessment_plot <- function(cf, combination.nr){
cf.df <- cf$byClass[,1:4] %>%
as.data.frame() %>%
mutate(Species = gsub("Class: ", "", rownames(cf$byClass[,1:5]))) %>%
rename("NPV" = "Neg Pred Value", "PPV" = "Pos Pred Value",
"Sens" = "Sensitivity", "Spec" = "Specificity") %>%
pivot_longer(cols = 1:4) %>%
mutate(col = ifelse(value>=0.9,"1",
ifelse(value<0.9 & value>=0.8,"2",
ifelse(value<0.8 & value>=0.7,"3","4"))))
plot <- cf.df %>%
ggplot(aes(name,Species,fill=col))+
geom_tile() +
geom_text(aes(label = round(value, 3)), col="white") +
scale_x_discrete(position = "top") +
scale_y_discrete(limits = rev(levels(as.factor(cf.df$Species)))) +
scale_fill_manual(values = c("#006837","#66bd63","#f46d43","#a50026"), breaks = c("1","2","3","4")) +
theme(legend.text = element_text(size=14),
title = element_text(size = 16),
axis.title = element_blank(),
axis.text = element_text(size=13),
legend.position = "none")+
labs(title=combination.nr, fill="")
return(plot)
}
formula <- factor(species) ~ Area_ABD + Area_Filled + Aspect_Ratio +
Average_Blue + Average_Green + Average_Red + Circle_Fit +
Circularity + Compactness +
Convexity + Diameter_ABD + Diameter_ESD + Edge_Gradient +
Elongation + Feret_Angle_Max + Feret_Angle_Min + Fiber_Curl +
Fiber_Straightness + Geodesic_Aspect_Ratio + Intensity + Length +
Perimeter + Ratio_Blue_Green + Ratio_Red_Blue + Ratio_Red_Green +
Roughness + Sigma_Intensity + Sum_Intensity + Symmetry + Transparency +
Width
set.seed(62)
classifiers_18c <- list()
classifiers_18c_plots <- list()
classifiers_18c_assess_plots <- list()
trainingdata <- dd_all[complete.cases(dd_all), ]
for(i in 1:length(compositions_list)){
if("Colpidium" %in% compositions_list[[i]]) {
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]],"airbubbles","ColpidiumVacuoles","Debris","Small_unidentified",
"OtherCiliate","ChlamydomonasClumps","DigestedAlgae","DividingChlamydomonas"))
} else {
sub_trainingdata <- trainingdata %>%
filter(species %in% c(compositions_list[[i]],"airbubbles","Debris","OtherCiliate","ChlamydomonasClumps","Small_unidentified",
"DigestedAlgae","DividingChlamydomonas"))
}
n.var <- length(unique(sub_trainingdata$species))
sub_trainingdata$species <- factor(sub_trainingdata$species)
split_up <- split(sub_trainingdata, f = sub_trainingdata$species)
subsamples <- lapply(split_up, function(x) {
x %>% sample_n(ifelse(nrow(x) < 500, nrow(x), 500))})
sub_trainingdata <- do.call("rbind", subsamples)
# table(sub_trainingdata$species)
# A stratified random split of the data
idx_train <- createDataPartition(sub_trainingdata$species,
p = 0.7, # percentage of data as training
list = FALSE)
sub_testdata <- sub_trainingdata[-idx_train,]
sub_trainingdata <- sub_trainingdata[idx_train,]
n.min <- min(table(sub_trainingdata$species))
obj <- tune(svm, formula, data = sub_trainingdata, kernel="radial",
ranges = list(gamma = 2^(-8:2), cost = 2^(1:10)), probability = TRUE,
tunecontrol = tune.control(sampling = "fix", best.model = T))
classifiers_18c[[i]] <- obj$best.model
cf <- caret::confusionMatrix(predict(obj$best.model, newdata=sub_testdata %>% select(-species)),
sub_testdata$species)
classifiers_18c_plots[[i]] <- classifier_plot_svm(table = cf$table,
combination.nr = names(compositions_list)[[i]])
classifiers_18c_assess_plots[[i]] <- classifier_assessment_plot(cf, names(compositions_list)[[i]])
}
names(classifiers_18c_plots) <- names(classifiers_18c) <-
names(classifiers_18c_assess_plots) <- comp_name
library(patchwork)
for(i in 1:length(classifiers_18c_plots)){
print(classifiers_18c_plots[[i]] + classifiers_18c_assess_plots[[i]] +
plot_layout(widths = c(7,3)))
}
saveRDS(classifiers_18c, "svm_flowcam_classifiers_18c_trained_december2021.rds")
# cl <- readRDS("classifiers_18c_25x.rds")
# labels <- dd_all %>%
# group_by(species) %>%
# summarise(xPos = max(Area_Filled),
# yPos = max((density(Area_Filled))$y))
#
# library(directlabels)
# direct.label(qplot(Area_Filled,data=dd_all,colour=species,geom="density",log="x"))